library(tidyverse)
library(nycflights13)
library(showtext)
font_add_google("Montserrat") # the closest to LBS font-- can get more fonts from fonts.google.com
showtext_auto(enable = TRUE)flights %>%
mutate(cancelled = ifelse(is.na(dep_time), TRUE, FALSE)) %>%
group_by(month, cancelled) %>%
summarise(count = n()) %>%
mutate(prop = count / sum(count))## # A tibble: 24 × 4
## # Groups: month [12]
## month cancelled count prop
## <int> <lgl> <int> <dbl>
## 1 1 FALSE 26483 0.981
## 2 1 TRUE 521 0.0193
## 3 2 FALSE 23690 0.949
## 4 2 TRUE 1261 0.0505
## 5 3 FALSE 27973 0.970
## 6 3 TRUE 861 0.0299
## 7 4 FALSE 27662 0.976
## 8 4 TRUE 668 0.0236
## 9 5 FALSE 28233 0.980
## 10 5 TRUE 563 0.0196
## # ℹ 14 more rows
# does it matter how we group_by()? Of course it does!
flights %>%
mutate(cancelled = ifelse(is.na(dep_time), TRUE, FALSE)) %>%
group_by(cancelled, month) %>%
summarise(count = n()) %>%
mutate(prop = count / sum(count)) ## # A tibble: 24 × 4
## # Groups: cancelled [2]
## cancelled month count prop
## <lgl> <int> <int> <dbl>
## 1 FALSE 1 26483 0.0806
## 2 FALSE 2 23690 0.0721
## 3 FALSE 3 27973 0.0851
## 4 FALSE 4 27662 0.0842
## 5 FALSE 5 28233 0.0859
## 6 FALSE 6 27234 0.0829
## 7 FALSE 7 28485 0.0867
## 8 FALSE 8 28841 0.0878
## 9 FALSE 9 27122 0.0826
## 10 FALSE 10 28653 0.0872
## # ℹ 14 more rows
Plot cancellation % by month
flights %>%
# find flights that were cancelled, ie, those with no departure time
mutate(cancelled = ifelse(is.na(dep_time), TRUE, FALSE)) %>%
# count how many cancellations/normal flights we had per month
group_by(month, cancelled) %>%
summarise(count = n()) %>%
# use mutate to create a new column that calculates %
mutate(prop = count / sum(count)) %>%
# only show those flights that were cancelled
filter(cancelled == TRUE) %>%
# pipe the resulting dataframe to ggpplot
ggplot() +
# add global aesthetics
aes(x=factor(month), y = prop)+
#just use a bar chart
geom_col()+
# change the theme, to theme_light()
theme_light()+
# format y-axis labels as, e.g., 15% and not 0.15
scale_y_continuous(labels = scales::percent)# how about we do it by airport?
flights %>%
mutate(cancelled = ifelse(is.na(dep_time), TRUE, FALSE)) %>%
group_by(origin, month, cancelled) %>%
summarise(count = n()) %>%
mutate(prop = count / sum(count)) %>%
filter(cancelled == TRUE) %>%
ggplot() +
aes(x=factor(month), y = prop)+
geom_col()+
theme_light()+
scale_y_continuous(labels = scales::percent)+
facet_wrap(~origin, ncol = 1)Please left_join() the resulting table with the table planes (also included in the nycflights13 package).
glimpse(flights)## Rows: 336,776
## Columns: 19
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1…
## $ dep_time <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
## $ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
## $ dep_delay <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
## $ arr_time <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
## $ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
## $ arr_delay <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…
## $ carrier <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
## $ flight <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 4…
## $ tailnum <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N394…
## $ origin <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA",…
## $ dest <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
## $ air_time <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 1…
## $ distance <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, …
## $ hour <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6…
## $ minute <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0…
## $ time_hour <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 0…
glimpse(planes)## Rows: 3,322
## Columns: 9
## $ tailnum <chr> "N10156", "N102UW", "N103US", "N104UW", "N10575", "N105UW…
## $ year <int> 2004, 1998, 1999, 1999, 2002, 1999, 1999, 1999, 1999, 199…
## $ type <chr> "Fixed wing multi engine", "Fixed wing multi engine", "Fi…
## $ manufacturer <chr> "EMBRAER", "AIRBUS INDUSTRIE", "AIRBUS INDUSTRIE", "AIRBU…
## $ model <chr> "EMB-145XR", "A320-214", "A320-214", "A320-214", "EMB-145…
## $ engines <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ seats <int> 55, 182, 182, 182, 55, 182, 182, 182, 182, 182, 55, 55, 5…
## $ speed <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
## $ engine <chr> "Turbo-fan", "Turbo-fan", "Turbo-fan", "Turbo-fan", "Turb…
flights %>%
# left join table flights with planes, that has all the info on airplanes
left_join(planes, by="tailnum") %>%
# for a single variable, count() is the same as group_by() and then summarise()
# if we add `sort=TRUE`, we get the table sorted in descending order (max to min)
count(tailnum, sort=TRUE)## # A tibble: 4,044 × 2
## tailnum n
## <chr> <int>
## 1 <NA> 2512
## 2 N725MQ 575
## 3 N722MQ 513
## 4 N723MQ 507
## 5 N711MQ 486
## 6 N713MQ 483
## 7 N258JB 427
## 8 N298JB 407
## 9 N353JB 404
## 10 N351JB 402
## # ℹ 4,034 more rows
which_plane <- flights %>%
left_join(planes, by="tailnum") %>%
filter(seats >= 50) %>%
count(tailnum, sort = TRUE) %>%
select(tailnum) %>%
# slice(1) will pull the first, highest number of flights
slice(1) %>%
# pull() creates a vector; if we dont use pull() we get a dataframe
pull()
which_plane## [1] "N328AA"
flights %>%
filter(tailnum == which_plane) %>%
count(origin, dest, sort=TRUE)## # A tibble: 6 × 3
## origin dest n
## <chr> <chr> <int>
## 1 JFK LAX 313
## 2 JFK SFO 52
## 3 JFK MIA 25
## 4 JFK BOS 1
## 5 JFK MCO 1
## 6 JFK SJU 1
Problem 4: The nycflights13 package includes a table (weather) that describes the weather during 2013. Use that table to answer the following questions:
temp) in July 2013? Identify
any important outliers in terms of the wind_speedvariable.
weather %>%
filter(month == 7) %>%
ggplot() +
aes(x= temp)+
geom_histogram()# looking at all months
weather %>%
# filter(month == 7) %>%
ggplot() +
aes(x= temp)+
geom_histogram() +
facet_wrap(~month, nrow=4)+
theme_light()# facet_wrap() for all months' wind_speed
weather %>%
ggplot() +
aes(x= wind_speed)+
geom_histogram()+
facet_wrap(~month,
ncol = 3, # to get 3 columns
scales = "free")+ # easy to check outliers, by having ggplot automatically adjust scales on axes
theme_light()+
theme(text=element_text(size=12, family="Montserrat"))dewp and
humid?precip and
visib?library(GGally) # package to plot scatterplot- correlation matrix
glimpse(weather)## Rows: 26,115
## Columns: 15
## $ origin <chr> "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EWR", "EW…
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013,…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ day <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ hour <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, …
## $ temp <dbl> 39.0, 39.0, 39.0, 39.9, 39.0, 37.9, 39.0, 39.9, 39.9, 41.0,…
## $ dewp <dbl> 26.06, 26.96, 28.04, 28.04, 28.04, 28.04, 28.04, 28.04, 28.…
## $ humid <dbl> 59.4, 61.6, 64.4, 62.2, 64.4, 67.2, 64.4, 62.2, 62.2, 59.6,…
## $ wind_dir <dbl> 270, 250, 240, 250, 260, 240, 240, 250, 260, 260, 260, 330,…
## $ wind_speed <dbl> 10.36, 8.06, 11.51, 12.66, 12.66, 11.51, 14.96, 10.36, 14.9…
## $ wind_gust <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 20.…
## $ precip <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
## $ pressure <dbl> 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1012, 1013, 1012,…
## $ visib <dbl> 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,…
## $ time_hour <dttm> 2013-01-01 01:00:00, 2013-01-01 02:00:00, 2013-01-01 03:00…
weather %>%
select(dewp, humid, precip, visib, temp) %>%
ggpairs()+
theme_bw()Problem 5: Use the flights and planes tables to answer the following questions:
# to get the dataframe
planes %>%
filter(is.na(year))## # A tibble: 70 × 9
## tailnum year type manufacturer model engines seats speed engine
## <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr>
## 1 N14558 NA Fixed wing multi… EMBRAER EMB-… 2 55 NA Turbo…
## 2 N15555 NA Fixed wing multi… EMBRAER EMB-… 2 55 NA Turbo…
## 3 N15574 NA Fixed wing multi… EMBRAER EMB-… 2 55 NA Turbo…
## 4 N174US NA Fixed wing multi… AIRBUS INDU… A321… 2 199 NA Turbo…
## 5 N177US NA Fixed wing multi… AIRBUS INDU… A321… 2 199 NA Turbo…
## 6 N181UW NA Fixed wing multi… AIRBUS INDU… A321… 2 199 NA Turbo…
## 7 N18557 NA Fixed wing multi… EMBRAER EMB-… 2 55 NA Turbo…
## 8 N194UW NA Fixed wing multi… AIRBUS A321… 2 199 NA Turbo…
## 9 N238JB NA Fixed wing multi… EMBRAER ERJ … 2 20 NA Turbo…
## 10 N271LV NA Fixed wing multi… BOEING 737-… 2 149 NA Turbo…
## # ℹ 60 more rows
# to get counts
planes %>%
count(is.na(year))## # A tibble: 2 × 2
## `is.na(year)` n
## <lgl> <int>
## 1 FALSE 3252
## 2 TRUE 70
(Hint: you may need to use case_when() to recode the manufacturer name and collapse rare vendors into a category called Other.)
planes %>%
count(manufacturer, sort = TRUE)## # A tibble: 35 × 2
## manufacturer n
## <chr> <int>
## 1 BOEING 1630
## 2 AIRBUS INDUSTRIE 400
## 3 BOMBARDIER INC 368
## 4 AIRBUS 336
## 5 EMBRAER 299
## 6 MCDONNELL DOUGLAS 120
## 7 MCDONNELL DOUGLAS AIRCRAFT CO 103
## 8 MCDONNELL DOUGLAS CORPORATION 14
## 9 CANADAIR 9
## 10 CESSNA 9
## # ℹ 25 more rows
# recode the manufacturer name and collapse rare vendors into a category called `Other`)
planes <- planes %>%
mutate(recode_manufacturer = case_when( # case_when() is an elegant, multiple ifelse()
manufacturer %in% c("BOEING") ~ "Boeing",
manufacturer %in% c("AIRBUS INDUSTRIE", "AIRBUS") ~ "Airbus",
manufacturer %in% c("EMBRAER") ~ "Embraer",
manufacturer %in% c("MCDONNELL DOUGLAS", "MCDONNELL DOUGLAS AIRCRAFT CO", "MCDONNELL DOUGLAS CORPORATION" ) ~ "McDonnell Douglas",
TRUE ~ "Other" # everything else will be "Other"
)) flights %>%
left_join(planes, by = "tailnum") %>%
filter(!is.na(recode_manufacturer)) %>%
group_by(month, recode_manufacturer) %>%
summarise(n = n()) %>%
mutate(prop = n/sum(n)) %>%
ggplot()+
aes(x=factor(month),
y = prop,
group = recode_manufacturer,
colour = recode_manufacturer)+
geom_line()+
theme_light()+
theme(text=element_text(size=12, family="Montserrat"))+
scale_y_continuous(labels = scales::percent)+
NULLProblem 6: Use the flights and planes tables to answer the following questions:
flights %>%
left_join(planes, by = "tailnum") %>%
rename(year_manufactured = year.y) %>%
select(tailnum, year_manufactured) %>%
distinct(tailnum, .keep_all = TRUE) %>%
arrange(year_manufactured)## # A tibble: 4,044 × 2
## tailnum year_manufactured
## <chr> <int>
## 1 N381AA 1956
## 2 N201AA 1959
## 3 N567AA 1959
## 4 N575AA 1963
## 5 N378AA 1963
## 6 N14629 1965
## 7 N615AA 1967
## 8 N425AA 1968
## 9 N383AA 1972
## 10 N364AA 1973
## # ℹ 4,034 more rows
flights %>%
distinct(tailnum) %>%
# semi_join() return all rows from x with a match in y.
semi_join(planes)## # A tibble: 3,322 × 1
## tailnum
## <chr>
## 1 N14228
## 2 N24211
## 3 N619AA
## 4 N804JB
## 5 N668DN
## 6 N39463
## 7 N516JB
## 8 N829AS
## 9 N593JB
## 10 N793JB
## # ℹ 3,312 more rows
# quicker. how many palnes from our `planes` table appear in `flights`
planes %>%
semi_join(flights, by="tailnum")## # A tibble: 3,322 × 10
## tailnum year type manufacturer model engines seats speed engine
## <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr>
## 1 N10156 2004 Fixed wing multi… EMBRAER EMB-… 2 55 NA Turbo…
## 2 N102UW 1998 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## 3 N103US 1999 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## 4 N104UW 1999 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## 5 N10575 2002 Fixed wing multi… EMBRAER EMB-… 2 55 NA Turbo…
## 6 N105UW 1999 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## 7 N107US 1999 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## 8 N108UW 1999 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## 9 N109UW 1999 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## 10 N110UW 1999 Fixed wing multi… AIRBUS INDU… A320… 2 182 NA Turbo…
## # ℹ 3,312 more rows
## # ℹ 1 more variable: recode_manufacturer <chr>
# An inner_join() only keeps observations from x that have a matching key in y.
flights %>%
distinct(tailnum) %>%
inner_join(planes)## # A tibble: 3,322 × 10
## tailnum year type manufacturer model engines seats speed engine
## <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr>
## 1 N14228 1999 Fixed wing multi… BOEING 737-… 2 149 NA Turbo…
## 2 N24211 1998 Fixed wing multi… BOEING 737-… 2 149 NA Turbo…
## 3 N619AA 1990 Fixed wing multi… BOEING 757-… 2 178 NA Turbo…
## 4 N804JB 2012 Fixed wing multi… AIRBUS A320… 2 200 NA Turbo…
## 5 N668DN 1991 Fixed wing multi… BOEING 757-… 2 178 NA Turbo…
## 6 N39463 2012 Fixed wing multi… BOEING 737-… 2 191 NA Turbo…
## 7 N516JB 2000 Fixed wing multi… AIRBUS INDU… A320… 2 200 NA Turbo…
## 8 N829AS 1998 Fixed wing multi… CANADAIR CL-6… 2 55 NA Turbo…
## 9 N593JB 2004 Fixed wing multi… AIRBUS A320… 2 200 NA Turbo…
## 10 N793JB 2011 Fixed wing multi… AIRBUS A320… 2 200 NA Turbo…
## # ℹ 3,312 more rows
## # ℹ 1 more variable: recode_manufacturer <chr>
Problem 7: Use the nycflights13 to answer the following questions:
flights %>%
group_by(month, origin) %>%
summarise(mean_arrival_delay = mean(arr_delay, na.rm=TRUE),
median_arrival_delay = median(arr_delay, na.rm=TRUE)
)## # A tibble: 36 × 4
## # Groups: month [12]
## month origin mean_arrival_delay median_arrival_delay
## <int> <chr> <dbl> <dbl>
## 1 1 EWR 12.8 0
## 2 1 JFK 1.37 -7
## 3 1 LGA 3.38 -4
## 4 2 EWR 8.78 -2
## 5 2 JFK 4.39 -5
## 6 2 LGA 3.15 -4
## 7 3 EWR 10.6 -4
## 8 3 JFK 2.58 -7
## 9 3 LGA 3.74 -7
## 10 4 EWR 14.1 -1
## # ℹ 26 more rows
flights %>%
group_by(carrier, month, origin) %>%
summarise(mean_arrival_delay = mean(arr_delay, na.rm=TRUE),
median_arrival_delay = median(arr_delay, na.rm=TRUE)
) %>%
left_join(airlines, by = "carrier") %>%
rename(carrier_name = name) %>%
ggplot()+
aes(x= factor(month),
y = median_arrival_delay,
colour = origin,
group = origin) +
geom_line()+
facet_wrap(~carrier_name, scales = 'free', ncol=3)+
theme_light() +
theme(
text=element_text(size=10, family="Montserrat"))+
labs(colour = "Origin Airport")Problem 8: Let’s take a closer look at what carriers service the route to San Francisco International (SFO). Join the flights and airlines tables and count which airlines flew the most to SFO. Produce a new dataframe, fly_into_sfo that contains three variables: the name of the airline, e.g., United Air Lines Inc. not UA, the count (number) of times it flew to SFO, and the percent of the trips that that particular airline flew to SFO.
fly_into_sfofly_into_sfo <- flights %>%
left_join(airlines, by = "carrier") %>%
filter(dest == "SFO") %>%
group_by(name) %>%
summarise(count = n()) %>%
mutate(percent = round(100*count/sum(count),1))
fly_into_sfo## # A tibble: 5 × 3
## name count percent
## <chr> <int> <dbl>
## 1 American Airlines Inc. 1422 10.7
## 2 Delta Air Lines Inc. 1858 13.9
## 3 JetBlue Airways 1035 7.8
## 4 United Air Lines Inc. 6819 51.2
## 5 Virgin America 2197 16.5
# And here is some bonus ggplot code to plot your dataframe
fly_into_sfo %>%
# sort 'name' of airline by the numbers it times to flew to SFO
mutate(name = fct_reorder(name, count)) %>%
ggplot() +
aes(x = count,
y = name) +
# a simple bar/column plot
geom_col() +
# add labels, so each bar shows the % of total flights
geom_text(aes(label = percent),
hjust = 1,
colour = "white",
size = 5)+
# add labels to help our audience
labs(title="United (UA) dominates the NYC to SFO route",
subtitle = "as % of total flights in 2013",
x= "Number of flights",
y= NULL) +
theme_minimal() +
# change the theme-- i just googled those , but you can use the ggThemeAssist add-in
# https://cran.r-project.org/web/packages/ggThemeAssist/index.html
theme(#
# so title is left-aligned
plot.title.position = "plot",
# text in axes appears larger
axis.text = element_text(size=12),
# title text is bigger
plot.title = element_text(size=18),
text=element_text(size=12, family="Montserrat")
) +
# add one final layer of NULL, so if you comment out any lines
# you never end up with a hanging `+` that awaits another ggplot layer
NULLHow would you explore this data set? Here are some ideas of tables/ graphs to help you with your analysis.
# select just age_difference and use the skimr package to get summary stats
age_gaps %>%
select(age_difference) %>%
skimr::skim()| Name | Piped data |
| Number of rows | 1155 |
| Number of columns | 1 |
| _______________________ | |
| Column type frequency: | |
| numeric | 1 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| age_difference | 0 | 1 | 10.4 | 8.51 | 0 | 4 | 8 | 15 | 52 | ▇▃▂▁▁ |
# histogram
age_gaps %>%
ggplot()+
aes(x = age_difference)+
geom_histogram()+
theme_bw()# faceted by character 1 gender
age_gaps %>%
ggplot()+
aes(x = age_difference)+
geom_histogram()+
theme_bw()+
facet_wrap(~character_1_gender, ncol = 1)Large age disparities in relationships carry certain stigmas. One popular rule of thumb is the half-your-age-plus-seven rule. This rule states you should never date anyone under half your age plus seven, establishing a minimum boundary on whom one can date. In order for a dating relationship to be acceptable under this rule, your partner’s age must be:
\[\frac{\text{Your age}}{2} + 7 < \text{Partner Age} < (\text{Your age} - 7) * 2\]
How frequently does this rule apply in this dataset?
age_gaps <- age_gaps %>%
mutate(
lower_limit = actor_1_age/2 + 7,
half_plus_seven = ifelse(actor_2_age >= lower_limit ,
TRUE,
FALSE))
age_gaps %>%
count(half_plus_seven) %>%
mutate(perc = n/sum(n))## # A tibble: 2 × 3
## half_plus_seven n perc
## <lgl> <int> <dbl>
## 1 FALSE 326 0.282
## 2 TRUE 829 0.718
age_gaps %>%
group_by(movie_name) %>%
summarise(n = n(),
median_age_difference = median(age_difference, na.rm=TRUE)) %>%
arrange(desc(n))## # A tibble: 830 × 3
## movie_name n median_age_difference
## <chr> <int> <dbl>
## 1 Love Actually 7 13
## 2 The Family Stone 6 4
## 3 A View to a Kill 5 28
## 4 He's Just Not That Into You 5 3
## 5 Mona Lisa Smile 5 3
## 6 A Star Is Born 4 10
## 7 American Pie 4 5
## 8 Boogie Nights 4 6.5
## 9 Closer 4 7
## 10 Pushing Tin 4 11.5
## # ℹ 820 more rows
age_gaps %>%
count(actor_1_name, sort=TRUE)## # A tibble: 567 × 2
## actor_1_name n
## <chr> <int>
## 1 Keanu Reeves 24
## 2 Adam Sandler 18
## 3 Roger Moore 17
## 4 Sean Connery 15
## 5 Harrison Ford 13
## 6 Johnny Depp 12
## 7 Pierce Brosnan 12
## 8 Leonardo DiCaprio 11
## 9 Richard Gere 10
## 10 Brad Pitt 9
## # ℹ 557 more rows
age_gaps %>%
count(actor_2_name, sort=TRUE)## # A tibble: 647 × 2
## actor_2_name n
## <chr> <int>
## 1 Keira Knightley 13
## 2 Scarlett Johansson 13
## 3 Emma Stone 11
## 4 Renee Zellweger 11
## 5 Drew Barrymore 9
## 6 Jennifer Lawrence 9
## 7 Julia Roberts 9
## 8 Amanda Seyfried 8
## 9 Audrey Hepburn 8
## 10 Jennifer Aniston 8
## # ℹ 637 more rows
age_gaps %>%
group_by(release_year) %>%
summarise(median_difference = median(age_difference, na.rm = TRUE)) %>%
ggplot()+
aes(x=release_year, y = median_difference) +
geom_line()+
geom_smooth(se=FALSE)+
theme_light()+
theme(text=element_text(size=12, family="Montserrat"))age_gaps <- age_gaps %>%
mutate(same_gender =
ifelse(character_1_gender == character_2_gender,
TRUE,
FALSE))
age_gaps %>%
count(same_gender) %>%
mutate(prop = n/sum(n))## # A tibble: 2 × 3
## same_gender n prop
## <lgl> <int> <dbl>
## 1 FALSE 1132 0.980
## 2 TRUE 23 0.0199
age_gaps %>%
ggplot()+
aes(x = release_year, y = age_difference, colour = character_1_gender)+
geom_point(alpha = 0.6)+
facet_wrap(~same_gender, ncol=1)+
theme_bw()+
theme(text=element_text(size=12, family="Montserrat"))+
labs(
colour = "Protagonist's gender"
)age_gaps %>%
group_by(release_year,same_gender) %>%
summarise( n = n()) %>%
mutate(prop = n/sum(n)) %>%
filter(same_gender == FALSE) %>%
mutate(same_gender_prop = 1 - prop) %>%
ggplot()+
aes(x=release_year, y = same_gender_prop)+
geom_line()+
scale_y_continuous(labels = scales::percent)+
theme_light()library(plotly)
plot <- ggplot(age_gaps)+
geom_point(aes(x=actor_1_age,
y = actor_2_age,
colour = character_1_gender,
# tooltip by default shows all variables used in aes()
# to add extra text in tooltip, we use text
text = paste(actor_1_name, "and", actor_2_name, "@",movie_name,"(",release_year,")")), alpha = 0.8)+
# add a line in steelblue that shows lower limit of half plus 7 rule
geom_line(aes(x = actor_1_age, y = lower_limit), colour = "steelblue", size = 1.5)+
theme_light()+
facet_wrap(~same_gender, ncol=1)
ggplotly(plot)base_plot <- age_gaps %>%
ggplot()+
aes(x=actor_1_age,
fill = character_1_gender)+
theme_light()+
labs(
title = "Most protagonists are men; majority < 40 years",
x = "Actors' age",
fill = "Protagonist's Gender",
y = NULL
)+
theme(text=element_text(size=12, family="Montserrat"))
base_plot + geom_histogram(alpha = 0.7)base_plot + geom_density(alpha = 0.6)base_plot + geom_boxplot(alpha = 0.7) base_plot + stat_ecdf(aes(colour = character_1_gender), geom = "step", pad = FALSE) +
scale_y_continuous(labels = scales::percent)